In part I, we tested four models: kNN, LDA, QDA, and Logistic Regression. We use the dataset from part I as the training dataset for part II of the project. We compare the following models: kNN, LDA, QDA, Logistic Regression, Random Forest, and SVM.
We attach the packages required for the analysis.
library(tidyverse) # data manipulation
library(ggplot2) # plots
library(caret) # model training
library(ROCR) # ROC tools
library(viridis) # ggplot coloring
library(e1071) # SVM
library(kernlab) # SVM
# Load the Part I training data and prepare it for modeling.
tuning <- read.csv("HaitiPixels.csv")
names(tuning) <- tolower(names(tuning))
# Collapse the multi-class response into a binary factor: "tarp" vs "other".
tuning$class <- factor(ifelse(tuning$class == "Blue Tarp", "tarp", "other"))
# Ensure the three RGB channel columns are numeric.
for (channel in c("red", "green", "blue")) {
  tuning[[channel]] <- as.numeric(tuning[[channel]])
}
# Quick look at class balance and channel ranges.
summary(tuning)
## class red green blue
## other:61219 Min. : 48 Min. : 48.0 Min. : 44.0
## tarp : 2022 1st Qu.: 80 1st Qu.: 78.0 1st Qu.: 63.0
## Median :163 Median :148.0 Median :123.0
## Mean :163 Mean :153.7 Mean :125.1
## 3rd Qu.:255 3rd Qu.:226.0 3rd Qu.:181.0
## Max. :255 Max. :255.0 Max. :255.0
# Boxplots of each RGB variable by surface type ("tarp" vs "other").
# pivot_longer() stacks the three channel columns into variable/value pairs
# so each channel gets its own facet.
tuning %>% pivot_longer(!class, names_to = "variable", values_to = "value") %>%
ggplot(aes(x=class, y=value))+geom_boxplot(aes(col = "white", fill = variable))+
# Fixed fill per channel; legends suppressed because facets label the channels.
scale_fill_manual(values = c(red = "#FF4E3F", green = "#50E23F", blue = "#504EB5"), guide = 'none')+
# col = "white" is mapped inside aes(); this scale resolves it to white.
scale_color_manual(values = "white", guide = 'none')+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
facet_wrap(.~variable)+ggtitle("Training Data")
# Density of each RGB channel, overlaid with a slim boxplot near the x-axis.
tuning %>% pivot_longer(!class, names_to = "variable", values_to = "value") %>%
ggplot()+
geom_density(col = NA, alpha = 0.5, aes(x=value, fill = variable))+
scale_fill_manual(values = c(red = "#FF4E3F", green = "#50E23F", blue = "#504EB5"), guide = 'none')+
# Axes fixed to the 0-255 pixel range; density axis capped for comparable panels.
xlim(0,255)+ ylim(0, 0.025)+
# Narrow boxplot drawn at y = 0.005 to summarize the same distribution.
geom_boxplot(lwd = 1, fatten = 1, fill = NA, width = 0.005, aes(x=value, y = 0.005))+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.title.y = element_blank())+
facet_wrap(.~variable, nrow = 3)+ggtitle("Training Data")
# Violin plots of each channel by class, with a narrow boxplot inside.
tuning %>% pivot_longer(!class, names_to = "variable", values_to = "value") %>%
ggplot(aes(x=value, y=class))+
geom_violin(col = "white", aes(fill = variable))+
scale_fill_manual(values = c(red = "#9C2C34", green = "#38c08c", blue = "#1E5B79"), guide = 'none')+
# Nearly transparent boxplot overlaid to show median/quartiles inside the violin.
geom_boxplot(col= "white", alpha=0.01, width = 0.05, fatten = 1)+
#stat_summary(fun.data = data_summary)+
xlim(0,255)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.title.y = element_blank(), axis.title.x = element_blank())+
facet_wrap(.~variable)+ggtitle("Training Data")
# Create interactive 3D scatter of the training pixels; each point is drawn
# in its own RGB color so blue tarps stand out visually.
options(rgl.useNULL=TRUE) # render off-screen so rglwidget() can embed the plot
plotids <- rgl::plot3d(x = tuning$red,
y = tuning$green,
z = tuning$blue,
col = rgb(tuning$red, tuning$green, tuning$blue, maxColorValue = 255),
xlab = "Red",
ylab = "Green",
zlab = "Blue",
xlim = c(0,255),
ylim = c(0,255),
zlim = c(0,255),
main = "Training Data")
rgl::rglwidget() # embed the interactive widget in the rendered document
There are seven files to read in, identified by the numbers 057, 067, 069, and 078. For each of 067, 069, and 078 there is a pair of files — one for tarps and one for non-tarps. File 057 has only non-tarp observations.
We create a function to process the holdout data .txt files.
# Read one holdout ROI .txt export and return a tidy data frame of pixels.
#
# Args:
#   fileName: path to a .txt ROI file. The first 8 lines (header metadata)
#             are skipped. Files whose names contain "NOT" or "NON" are
#             labeled "other"; all remaining files are labeled "tarp".
# Returns:
#   A data.frame with numeric columns red, green, blue and a factor column
#   class.
# Raises:
#   An error when fileName is not a character string or does not end in ".txt".
processPixels <- function(fileName){
  if(!is.character(fileName)){
    stop("Please provide string input.")
  } else if(!grepl("\\.txt$", fileName)){
    # Anchored pattern: the original grepl(".txt", ...) treated the dot as a
    # regex wildcard, so names like "mytxtfile" were wrongly accepted.
    stop("File Name invalid.")
  }
  dt <- read.table(fileName, skip = 8, header = FALSE) # skip the ROI header
  dt <- dt[, (ncol(dt)-2):ncol(dt)] # keep only the trailing RGB columns
  names(dt) <- c("red", "green", "blue")
  # "NOT"/"NON" in the file name marks a non-tarp file; everything else is tarp.
  label <- if(grepl("NOT|NON", fileName)) "other" else "tarp"
  dt$class <- factor(rep(label, nrow(dt)))
  dt$red <- as.numeric(dt$red)
  dt$green <- as.numeric(dt$green)
  dt$blue <- as.numeric(dt$blue)
  return(dt)
}
Now we process the text files and combine into a single holdout set. It is assumed that the text files exist in the present directory.
# Read every holdout ROI file (in the same order as before) and stack them
# into one data frame.
holdout_files <- c("orthovnir057_ROI_NON_Blue_Tarps.txt",
                   "orthovnir067_ROI_Blue_Tarps.txt",
                   "orthovnir067_ROI_NOT_Blue_Tarps.txt",
                   "orthovnir069_ROI_Blue_Tarps.txt",
                   "orthovnir069_ROI_NOT_Blue_Tarps.txt",
                   "orthovnir078_ROI_Blue_Tarps.txt",
                   "orthovnir078_ROI_NON_Blue_Tarps.txt")
holdout <- do.call(rbind, lapply(holdout_files, processPixels))
rm(holdout_files) # keep the workspace tidy
# Cache the combined holdout set for reuse.
write.csv(holdout, "holdout.csv")
summary(holdout)
## red green blue class
## Min. : 27.0 Min. : 28.0 Min. : 25.00 other:1989697
## 1st Qu.: 76.0 1st Qu.: 71.0 1st Qu.: 55.00 tarp : 14480
## Median :107.0 Median : 91.0 Median : 66.00
## Mean :118.3 Mean :105.4 Mean : 82.36
## 3rd Qu.:139.0 3rd Qu.:117.0 3rd Qu.: 88.00
## Max. :255.0 Max. :255.0 Max. :255.00
Preview the holdout data.
# Boxplots of each RGB variable by surface type for the holdout set,
# mirroring the training-data plot above.
holdout %>% pivot_longer(!class, names_to = "variable", values_to = "value") %>%
ggplot(aes(x=class, y=value))+geom_boxplot(aes(col = "white", fill = variable))+
scale_fill_manual(values = c(red = "#8B4737", green = "#4C7537", blue = "#4C4758"), guide = 'none')+
# col = "white" is mapped inside aes(); this scale resolves it to white.
scale_color_manual(values = "white", guide = 'none')+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
facet_wrap(.~variable)+ggtitle("Holdout Data")
# Density of each RGB channel for the holdout set, with a slim boxplot
# near the x-axis (same layout as the training-data version).
holdout %>% pivot_longer(!class, names_to = "variable", values_to = "value") %>%
ggplot()+
geom_density(col = NA, alpha = 0.5, aes(x=value, fill = variable))+
scale_fill_manual(values = c(red = "#8B4737", green = "#4C7537", blue = "#4C4758"), guide = 'none')+
# Axes fixed to the 0-255 pixel range; density axis capped for comparability.
xlim(0,255)+ylim(0,0.025)+
geom_boxplot(lwd = 1, fatten = 1, fill = NA, width = 0.005, aes(x=value, y = 0.005))+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.title.y = element_blank())+
facet_wrap(.~variable, nrow = 3)+ggtitle("Holdout Data")
# Violin plots of each channel by class for the holdout set, with a narrow
# boxplot inside each violin.
holdout %>% pivot_longer(!class, names_to = "variable", values_to = "value") %>%
ggplot(aes(x=value, y=class))+
geom_violin(col = "white", aes(fill = variable))+
scale_fill_manual(values = c(red = "#9C2C34", green = "#38c08c", blue = "#1E5B79"), guide = 'none')+
geom_boxplot(col= "white", alpha=0.01, width = 0.05, fatten = 1)+
#stat_summary(fun.data = data_summary)+
xlim(0,255)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.title.y = element_blank(), axis.title.x = element_blank())+
facet_wrap(.~variable)+ggtitle("Holdout Data")
# Create interactive 3D scatter plot of the holdout pixels.
# The holdout data has ~2e6 observations; plotting all of them is slow, so
# we randomly sample 0.5% of the holdout set (0.005 below) and plot only those.
set.seed(199)
holdout.sample <- holdout[sample(nrow(holdout), round(0.005*nrow(holdout))), ]
options(rgl.useNULL=TRUE) # render off-screen so rglwidget() can embed the plot
plotids <- rgl::plot3d(x = holdout.sample$red,
y = holdout.sample$green,
z = holdout.sample$blue,
col = rgb(holdout.sample$red,
holdout.sample$green,
holdout.sample$blue,
maxColorValue = 255),
xlab = "Red",
ylab = "Green",
zlab = "Blue",
xlim = c(0,255),
ylim = c(0,255),
zlim = c(0,255),
main = "Holdout Data")
rgl::rglwidget() # embed the interactive widget in the rendered document
We outline the methodology for splitting the training data.
# Split the training data 50/50 into a tuning half and a test half,
# stratified on class by createDataPartition().
set.seed(199)
train <- createDataPartition(tuning$class, p = 0.5, list = F)
# create new dataframes of test data to be passed to predict()
testX <- tuning[-train, ] %>% dplyr::select(-class)
testLabels <- tuning[-train, 'class']
# split holdout data similarly
holdoutX <- holdout %>% select(-class)
holdoutLabels <- holdout$class
# create list of seeds for kNN and other 1-parameter models:
# 50 resampling iterations (10-fold CV x 5 repeats) each get 20 candidate
# seeds (must be >= the number of tuning values), plus one final-model seed.
set.seed(199)
seeds <- vector(mode = "list", length = 51)
for(i in 1:(length(seeds)-1)) seeds[[i]]<- sample.int(n=1000, 20)
seeds[[51]]<-sample.int(1000, 1)
# create list of seeds for SVM (RBF kernel) and other 2-parameter models
set.seed(199)
# NOTE(review): no multi-parameter seed list is built here; a separate list
# (svmP.seeds) is created further below for the custom polynomial SVM.
# trainControl for tuning: repeated CV (number of folds defaults to 10).
ctrl <- trainControl(method= "repeatedcv", repeats = 5, seeds = seeds)
We use the training data to tune the model. We want to identify the value of \(k\) that provides the lowest cross-validation error. The train() function from the caret package is used.
# Tune k for kNN with repeated 10-fold CV; tuneLength = 15 tries 15 candidate
# values of k (the output below shows odd k from 5 to 33). Channels are
# centered and scaled since kNN is distance-based.
knn.tuning <- train(class~.,
data = tuning,
subset = train,
method = "knn",
metric = "Accuracy",
trControl = ctrl,
preProcess = c("center","scale"),
tuneLength = 15)
#tuneGrid = expand.grid(k = seq(5,24)))
knn.tuning
## k-Nearest Neighbors
##
## 63241 samples
## 3 predictor
## 2 classes: 'other', 'tarp'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 28459, 28459, 28459, 28459, 28458, 28459, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9967996 0.9485798
## 7 0.9967363 0.9473888
## 9 0.9966478 0.9459434
## 11 0.9966478 0.9458139
## 13 0.9966098 0.9451114
## 15 0.9966667 0.9460311
## 17 0.9965655 0.9443363
## 19 0.9965403 0.9439034
## 21 0.9965276 0.9436469
## 23 0.9964137 0.9418620
## 25 0.9964201 0.9418399
## 27 0.9964074 0.9415390
## 29 0.9963315 0.9402337
## 31 0.9962809 0.9393528
## 33 0.9961734 0.9375580
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
10-fold cross-validation, repeated five times, selects k = 5.
# Evaluate the tuned kNN on the held-back test half of the training data;
# "tarp" is treated as the positive class.
knn.preds <- predict(knn.tuning, newdata = testX)
confusionMatrix(knn.preds, testLabels, positive = "tarp")
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 30547 29
## tarp 62 982
##
## Accuracy : 0.9971
## 95% CI : (0.9965, 0.9977)
## No Information Rate : 0.968
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9542
##
## Mcnemar's Test P-Value : 0.0007951
##
## Sensitivity : 0.97132
## Specificity : 0.99797
## Pos Pred Value : 0.94061
## Neg Pred Value : 0.99905
## Prevalence : 0.03197
## Detection Rate : 0.03106
## Detection Prevalence : 0.03302
## Balanced Accuracy : 0.98464
##
## 'Positive' Class : tarp
##
The accuracy on the test subset is 0.9971.
For the sake of comparison, we also make predictions on the holdout set with each model.
# Predict labels on holdout data. Create confusion matrix
# ("tarp" is treated as the positive class).
knn.holdout.preds <- predict(knn.tuning, newdata = holdoutX)
confusionMatrix(knn.holdout.preds, holdoutLabels, positive = "tarp")
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 1974768 1918
## tarp 14929 12562
##
## Accuracy : 0.9916
## 95% CI : (0.9915, 0.9917)
## No Information Rate : 0.9928
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.5948
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.867541
## Specificity : 0.992497
## Pos Pred Value : 0.456950
## Neg Pred Value : 0.999030
## Prevalence : 0.007225
## Detection Rate : 0.006268
## Detection Prevalence : 0.013717
## Balanced Accuracy : 0.930019
##
## 'Positive' Class : tarp
##
# Predict Probabilities for ROC
knn.holdout.probs <- predict(knn.tuning, newdata = holdoutX, type = "prob")
# ROCR: build a prediction object from the "tarp" probability column, then
# extract TPR/FPR pairs and the AUC.
knn.rates <- prediction(knn.holdout.probs[2], holdoutLabels)
knn.ROC <- performance(knn.rates, measure = "tpr", x.measure = "fpr")
knn.auc <- performance(knn.rates, measure = "auc")
knn.auc@y.values #auc = 0.9625007
## [[1]]
## [1] 0.9625007
# colorize ROC
# Collect the FPR/TPR/threshold vectors into one data frame; c() merges the
# single-element lists stored in the ROCR performance slots.
knn.roc.df <- data.frame(c("FPR" = knn.ROC@x.values, "TPR" = knn.ROC@y.values, "Threshold" = knn.ROC@alpha.values))
knn.roc.df$Threshold[1] <- 1 # replace infinity
knn.roc.df %>% ggplot(aes(FPR, TPR))+
# Color the curve by classification threshold.
geom_line(aes(color=Threshold), lwd = 2)+
scale_colour_viridis(option = "plasma")+
geom_abline(lwd = 1, color = "gray")+
theme_bw()+
theme(axis.line = element_line(color='gray'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
ggtitle("ROC of kNN (k = 5)")+
annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9625")
# create grid: a 151^3 RGB lattice used to visualize decision boundaries as
# 2D slices of the color cube (reused by the later models as well).
boundary.grid <- expand.grid(red = seq(25,255,length.out=151),
green = seq(25,255,length.out=151),
blue = seq(25,255,length.out=151))
# predict over boundary.grid
boundary.probs <- predict(knn.tuning, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])
# Orange-to-blue palette for the 13 probability contour bands.
colPal <- c("#ff822e", "#ff9751", "#ffac74", "#ffc197", "#ffd5b9",
"#ffeadc","#ffffff", "#dfedf9", "#c0dbf2", "#a0c9ec",
"#80b6e6", "#61a4df", "#4192d9")
# Mark each holdout pixel as a hit (correctly classified) or a miss.
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(knn.holdout.preds==holdout$class,'hit', 'miss'))
# Slice the RGB cube at green == 117 and overlay holdout pixels at that value.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
breaks = seq(0,1.0001,length.out=14),
aes(x=red, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
size = 2, stroke = 1.3,
aes(x=red, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("kNN Decision Boundary [Blue against Red]")
# Second slice: fix red == 140 and plot blue against green.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
breaks = seq(0,1.0001,length.out=14),
aes(x=green, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
size = 2, stroke = 1.3,
aes(x=green, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("kNN Decision Boundary [Blue against Green]")
Again, we use train() to fit the LDA model. There are no parameters to tune.
# LDA has no tuning parameters; repeated 10-fold CV just estimates performance.
lda.ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, seeds = seeds)
lda.tuning <- train(class~.,
data = tuning,
subset = train,
method = "lda",
metric = "Accuracy",
trControl = lda.ctrl)
lda.tuning
## Linear Discriminant Analysis
##
## 63241 samples
## 3 predictor
## 2 classes: 'other', 'tarp'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 28459, 28459, 28459, 28459, 28459, 28459, ...
## Resampling results:
##
## Accuracy Kappa
## 0.98353 0.7459956
We make predictions on the test subset.
# Evaluate LDA on the test half; "tarp" is the positive class.
lda.preds <- predict(lda.tuning, newdata = testX)
confusionMatrix(lda.preds, testLabels, positive = "tarp")
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 30307 191
## tarp 302 820
##
## Accuracy : 0.9844
## 95% CI : (0.983, 0.9857)
## No Information Rate : 0.968
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7608
##
## Mcnemar's Test P-Value : 7.265e-07
##
## Sensitivity : 0.81108
## Specificity : 0.99013
## Pos Pred Value : 0.73084
## Neg Pred Value : 0.99374
## Prevalence : 0.03197
## Detection Rate : 0.02593
## Detection Prevalence : 0.03548
## Balanced Accuracy : 0.90061
##
## 'Positive' Class : tarp
##
# Evaluate LDA on the full holdout set; "tarp" is the positive class.
lda.holdout.preds <- predict(lda.tuning, newdata = holdoutX)
confusionMatrix(lda.holdout.preds, holdoutLabels, positive = "tarp")
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 1955460 2326
## tarp 34237 12154
##
## Accuracy : 0.9818
## 95% CI : (0.9816, 0.9819)
## No Information Rate : 0.9928
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3926
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.839365
## Specificity : 0.982793
## Pos Pred Value : 0.261990
## Neg Pred Value : 0.998812
## Prevalence : 0.007225
## Detection Rate : 0.006064
## Detection Prevalence : 0.023147
## Balanced Accuracy : 0.911079
##
## 'Positive' Class : tarp
##
# Predict Probabilities for ROC
lda.holdout.probs <- predict(lda.tuning, newdata = holdoutX, type = "prob")
# ROCR objects built from the "tarp" probability column.
lda.rates <- prediction(lda.holdout.probs[2], holdoutLabels)
lda.ROC <- performance(lda.rates, measure = "tpr", x.measure = "fpr")
lda.auc <- performance(lda.rates, measure = "auc")
lda.auc@y.values #auc = 0.9921876
## [[1]]
## [1] 0.9921876
# colorize ROC (same construction as the kNN ROC plot)
lda.roc.df <- data.frame(c("FPR" = lda.ROC@x.values, "TPR" = lda.ROC@y.values, "Threshold" = lda.ROC@alpha.values))
lda.roc.df$Threshold[1] <- 1 # replace infinity
lda.roc.df %>% ggplot(aes(FPR, TPR))+
geom_line(aes(color=Threshold), lwd = 2)+
scale_colour_viridis(option = "plasma")+
geom_abline(lwd = 1, color = "gray")+
theme_bw()+
theme(axis.line = element_line(color='gray'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
ggtitle("ROC of LDA")+
annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9922")
# predict over boundary.grid.
# Reuses boundary.grid and colPal defined in the kNN section.
boundary.probs <- predict(lda.tuning, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])
# Mark each holdout pixel as a hit or miss for this model.
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(lda.holdout.preds==holdout$class,'hit', 'miss'))
# Slice the RGB cube at green == 117: blue vs red decision boundary.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
breaks = seq(0,1.0001,length.out=14),
aes(x=red, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
size = 2, stroke = 1.3,
aes(x=red, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("LDA Decision Boundary [Blue against Red]")
# Slice at red == 140: blue vs green decision boundary.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
breaks = seq(0,1.0001,length.out=14),
aes(x=green, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
size = 2, stroke = 1.3,
aes(x=green, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("LDA Decision Boundary [Blue against Green]")
We fit a QDA model with train().
# QDA also has no tuning parameters; repeated 10-fold CV estimates performance.
qda.ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, seeds = seeds)
qda.tuning <- train(class~.,
data = tuning,
subset = train,
method = "qda",
metric = "Accuracy",
trControl = qda.ctrl)
qda.tuning
## Quadratic Discriminant Analysis
##
## 63241 samples
## 3 predictor
## 2 classes: 'other', 'tarp'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 28458, 28459, 28459, 28459, 28459, 28459, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9941811 0.8974676
We make predictions on the test subset.
# Evaluate QDA on the test half; "tarp" is the positive class.
qda.preds <- predict(qda.tuning, newdata = testX)
confusionMatrix(qda.preds, testLabels, positive = 'tarp')
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 30596 145
## tarp 13 866
##
## Accuracy : 0.995
## 95% CI : (0.9942, 0.9958)
## No Information Rate : 0.968
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9138
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.85658
## Specificity : 0.99958
## Pos Pred Value : 0.98521
## Neg Pred Value : 0.99528
## Prevalence : 0.03197
## Detection Rate : 0.02739
## Detection Prevalence : 0.02780
## Balanced Accuracy : 0.92808
##
## 'Positive' Class : tarp
##
We use the QDA model to make predictions on the holdout set.
# Evaluate QDA on the full holdout set; "tarp" is the positive class.
qda.holdout.preds <- predict(qda.tuning, newdata = holdoutX)
confusionMatrix(qda.holdout.preds, holdoutLabels, positive = 'tarp')
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 1985998 4139
## tarp 3699 10341
##
## Accuracy : 0.9961
## 95% CI : (0.996, 0.9962)
## No Information Rate : 0.9928
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7232
##
## Mcnemar's Test P-Value : 7.099e-07
##
## Sensitivity : 0.714157
## Specificity : 0.998141
## Pos Pred Value : 0.736538
## Neg Pred Value : 0.997920
## Prevalence : 0.007225
## Detection Rate : 0.005160
## Detection Prevalence : 0.007005
## Balanced Accuracy : 0.856149
##
## 'Positive' Class : tarp
##
# Predict Probabilities for ROC
qda.holdout.probs <- predict(qda.tuning, newdata = holdoutX, type = "prob")
# ROCR objects built from the "tarp" probability column.
qda.rates <- prediction(qda.holdout.probs[2], holdoutLabels)
qda.ROC <- performance(qda.rates, measure = "tpr", x.measure = "fpr")
qda.auc <- performance(qda.rates, measure = "auc")
qda.auc@y.values #auc = 0.9926837
## [[1]]
## [1] 0.9926837
# colorize ROC (same construction as the kNN ROC plot)
qda.roc.df <- data.frame(c("FPR" = qda.ROC@x.values, "TPR" = qda.ROC@y.values, "Threshold" = qda.ROC@alpha.values))
qda.roc.df$Threshold[1] <- 1 # replace infinity
qda.roc.df %>% ggplot(aes(FPR, TPR))+
geom_line(aes(color=Threshold), lwd = 2)+
scale_colour_viridis(option = "plasma")+
geom_abline(lwd = 1, color = "gray")+
theme_bw()+
theme(axis.line = element_line(color='gray'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
ggtitle("ROC of QDA")+
annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9927")
# predict over boundary.grid.
# Reuses boundary.grid and colPal defined in the kNN section.
boundary.probs <- predict(qda.tuning, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])
# Mark each holdout pixel as a hit or miss for this model.
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(qda.holdout.preds==holdout$class,'hit', 'miss'))
# Slice the RGB cube at green == 117: blue vs red decision boundary.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
breaks = seq(0,1.0001,length.out=14),
aes(x=red, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
size = 2, stroke = 1.3,
aes(x=red, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("QDA Decision Boundary [Blue against Red]")
# Slice at red == 140: blue vs green decision boundary.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
breaks = seq(0,1.0001,length.out=14),
aes(x=green, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
size = 2, stroke = 1.3,
aes(x=green, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("QDA Decision Boundary [Blue against Green]")
# Logistic regression via glm (binomial family); no tuning parameters,
# repeated 10-fold CV estimates performance.
logistic.ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, seeds = seeds)
logistic.tuning <- train(class~.,
data = tuning,
subset = train,
method = "glm",
family = "binomial",
metric = "Accuracy",
trControl = logistic.ctrl)
logistic.tuning
## Generalized Linear Model
##
## 63241 samples
## 3 predictor
## 2 classes: 'other', 'tarp'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 28458, 28459, 28459, 28459, 28459, 28459, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9947314 0.9107111
We make predictions on the test subset.
# Evaluate logistic regression on the test half; "tarp" is the positive class.
logistic.preds <- predict(logistic.tuning, newdata = testX)
confusionMatrix(logistic.preds, testLabels, positive = 'tarp')
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 30579 99
## tarp 30 912
##
## Accuracy : 0.9959
## 95% CI : (0.9952, 0.9966)
## No Information Rate : 0.968
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9318
##
## Mcnemar's Test P-Value : 2.137e-09
##
## Sensitivity : 0.90208
## Specificity : 0.99902
## Pos Pred Value : 0.96815
## Neg Pred Value : 0.99677
## Prevalence : 0.03197
## Detection Rate : 0.02884
## Detection Prevalence : 0.02979
## Balanced Accuracy : 0.95055
##
## 'Positive' Class : tarp
##
# Evaluate logistic regression on the full holdout set.
logistic.holdout.preds <- predict(logistic.tuning, newdata = holdoutX)
confusionMatrix(logistic.holdout.preds, holdoutLabels, positive = 'tarp')
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 1969423 159
## tarp 20274 14321
##
## Accuracy : 0.9898
## 95% CI : (0.9897, 0.9899)
## No Information Rate : 0.9928
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.5794
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.989019
## Specificity : 0.989811
## Pos Pred Value : 0.413962
## Neg Pred Value : 0.999919
## Prevalence : 0.007225
## Detection Rate : 0.007146
## Detection Prevalence : 0.017261
## Balanced Accuracy : 0.989415
##
## 'Positive' Class : tarp
##
# Predict Probabilities for ROC
logistic.holdout.probs <- predict(logistic.tuning, newdata = holdoutX, type = 'prob')
# ROCR objects built from the "tarp" probability column.
logistic.rates <- prediction(logistic.holdout.probs[2], holdoutLabels)
logistic.ROC <- performance(logistic.rates, measure = 'tpr', x.measure = 'fpr')
logistic.auc <- performance(logistic.rates, measure = "auc")
logistic.auc@y.values #auc = 0.9994541
## [[1]]
## [1] 0.9994541
# colorize ROC (same construction as the kNN ROC plot)
logistic.roc.df <- data.frame(c("FPR" = logistic.ROC@x.values,
"TPR" = logistic.ROC@y.values,
"Threshold" = logistic.ROC@alpha.values))
logistic.roc.df$Threshold[1] <- 1 # replace infinity
logistic.roc.df %>% ggplot(aes(FPR, TPR))+
geom_line(aes(color=Threshold), lwd = 2)+
scale_colour_viridis(option = "plasma")+
geom_abline(lwd = 1, color = "gray")+
theme_bw()+
theme(axis.line = element_line(color='gray'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
ggtitle("ROC of Logistic Regression")+
annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9995")
# predict over boundary.grid.
# Reuses boundary.grid and colPal defined in the kNN section.
boundary.probs <- predict(logistic.tuning, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])
# Mark each holdout pixel as a hit or miss for this model.
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(logistic.holdout.preds==holdout$class,'hit', 'miss'))
# Slice the RGB cube at green == 117: blue vs red decision boundary.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
breaks = seq(0,1.0001,length.out=14),
aes(x=red, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
size = 2, stroke = 1.3,
aes(x=red, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("Logistic Regression Decision Boundary [Blue against Red]")
# Slice at red == 140: blue vs green decision boundary.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
breaks = seq(0,1.0001,length.out=14),
aes(x=green, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
size = 2, stroke = 1.3,
aes(x=green, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("Logistic Regression Decision Boundary [Blue against Green]")
# Random forest; with only 3 predictors, mtry can only sensibly be 2 or 3
# (mtry = 1 would ignore interactions between channels).
rf.ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5, seeds = seeds)
rf.tuning <- train(class~.,
data = tuning,
subset = train,
method = 'rf',
metric = 'Accuracy',
trControl = rf.ctrl,
tuneGrid = expand.grid(mtry = c(2,3)))
rf.tuning
## Random Forest
##
## 63241 samples
## 3 predictor
## 2 classes: 'other', 'tarp'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 28458, 28459, 28459, 28459, 28459, 28459, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9961292 0.9368615
## 3 0.9960216 0.9352254
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Evaluate the random forest on the test half; "tarp" is the positive class.
rf.preds <- predict(rf.tuning, newdata = testX)
confusionMatrix(rf.preds, testLabels, positive = 'tarp')
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 30553 41
## tarp 56 970
##
## Accuracy : 0.9969
## 95% CI : (0.9963, 0.9975)
## No Information Rate : 0.968
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9508
##
## Mcnemar's Test P-Value : 0.1552
##
## Sensitivity : 0.95945
## Specificity : 0.99817
## Pos Pred Value : 0.94542
## Neg Pred Value : 0.99866
## Prevalence : 0.03197
## Detection Rate : 0.03068
## Detection Prevalence : 0.03245
## Balanced Accuracy : 0.97881
##
## 'Positive' Class : tarp
##
# Evaluate the random forest on the full holdout set.
rf.holdout.preds <- predict(rf.tuning, newdata = holdoutX)
confusionMatrix(rf.holdout.preds, holdoutLabels, positive = 'tarp')
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 1974603 2336
## tarp 15094 12144
##
## Accuracy : 0.9913
## 95% CI : (0.9912, 0.9914)
## No Information Rate : 0.9928
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.5782
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.838674
## Specificity : 0.992414
## Pos Pred Value : 0.445848
## Neg Pred Value : 0.998818
## Prevalence : 0.007225
## Detection Rate : 0.006059
## Detection Prevalence : 0.013591
## Balanced Accuracy : 0.915544
##
## 'Positive' Class : tarp
##
# Predict Probabilities for ROC
rf.holdout.probs <- predict(rf.tuning, newdata = holdoutX, type = 'prob')
# ROCR objects built from the "tarp" probability column.
rf.rates <- prediction(rf.holdout.probs[2], holdoutLabels)
rf.ROC <- performance(rf.rates, measure = "tpr", x.measure = "fpr")
rf.auc <- performance(rf.rates, measure = "auc")
rf.auc@y.values #auc = 0.9798993
## [[1]]
## [1] 0.9798993
# colorize ROC (same construction as the kNN ROC plot)
rf.roc.df <- data.frame(c("FPR" = rf.ROC@x.values, "TPR" = rf.ROC@y.values, "Threshold" = rf.ROC@alpha.values))
rf.roc.df$Threshold[1] <- 1 # replace infinity
rf.roc.df %>% ggplot(aes(FPR, TPR))+
geom_line(aes(color=Threshold), lwd = 2)+
scale_colour_viridis(option = "plasma")+
geom_abline(lwd = 1, color = "gray")+
theme_bw()+
theme(axis.line = element_line(color='gray'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
ggtitle("ROC of Random Forest (mtry = 2)")+
annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9799")
# predict over boundary.grid.
# Reuses boundary.grid and colPal defined in the kNN section.
boundary.probs <- predict(rf.tuning, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])
# Mark each holdout pixel as a hit or miss for this model.
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(rf.holdout.preds==holdout$class,'hit', 'miss'))
# Slice the RGB cube at green == 117: blue vs red decision boundary.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
breaks = seq(0,1.0001,length.out=14),
aes(x=red, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
size = 2, stroke = 1.3,
aes(x=red, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("Random Forest Decision Boundary [Blue against Red]")
# Slice at red == 140: blue vs green decision boundary.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
breaks = seq(0,1.0001,length.out=14),
aes(x=green, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
size = 2, stroke = 1.3,
aes(x=green, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("Random Forest Decision Boundary [Blue against Green]")
# Seeds for the custom polynomial SVM: 50 resampling iterations (10-fold CV
# x 5 repeats) each get 15 candidate seeds, plus one final-model seed.
set.seed(199)
svmP.seeds <- vector(mode = "list", length = 51)
for(i in 1:(length(svmP.seeds)-1)) svmP.seeds[[i]]<- sample.int(n=1000, 15)
svmP.seeds[[51]]<-sample.int(1000, 1)
# We create a custom SVM with Polynomial kernel so that we can tune over offset.
# The default svmPoly offered by train sets offset = 1. This should be tunable--
# or at the very least, we should be able to set offset = 0.
# Skeleton of a caret custom-model list: classification model backed by
# kernlab, with no internal loop over submodels.
svmPolyCustom <- list(type = "Classification",
library = "kernlab",
loop = NULL)
# Declare the four tunable parameters caret will search over.
prm <- data.frame(parameter = c("degree", "scale", "offset", "C"),
class = rep("numeric", 4),
label = c("Degree", "Scale", "Offset", "Cost"))
svmPolyCustom$parameters <- prm
# caret `grid` slot: builds candidate hyperparameter sets of size `len`.
# Grid search: degrees 1..min(len, 3); log-spaced scale and cost; offsets
# alternating around zero (0, 1, -1, 2, -2, ...). Random search: each
# parameter drawn from a wide distribution (same draws as the original).
svmGrid <- function(x, y, len = NULL, search = "grid") {
  if (search == "grid") {
    degrees <- seq(1, min(len, 3))
    scales  <- 10^((1:len) - 4)
    offsets <- (-1)^(1:len) * ceiling((0:(len - 1)) / 2)
    costs   <- 2^((1:len) - 3)
    expand.grid(degree = degrees, scale = scales,
                offset = offsets, C = costs)
  } else {
    data.frame(degree = sample(1:3, size = len, replace = TRUE),
               scale  = 10^runif(len, min = -5, max = log10(2)),
               offset = runif(len, min = -len, max = len),
               C      = 2^runif(len, min = -5, max = 10))
  }
}
# Register the tuning-grid generator on the custom caret model spec.
svmPolyCustom$grid <- svmGrid
# caret `fit` slot: trains a kernlab polynomial-kernel SVM for one row of the
# tuning grid. BUG FIX: the original hard-coded offset = 0 in the
# classification branch, silently ignoring param$offset and defeating the
# stated purpose of this custom model (a tunable offset). Both branches now
# pass param$offset through to polydot(). (All results in this report used
# offset = 0 in the grid, so they are unaffected.)
svmFit <- function(x, y, wts, param, lev, last, classProbs, ...) {
  # If the caller already supplies prob.model via ... (or this is a numeric
  # outcome, i.e. regression), forward ... untouched; otherwise request
  # class probabilities explicitly so the $prob slot works.
  if (any(names(list(...)) == "prob.model") || is.numeric(y)) {
    out <- kernlab::ksvm(x = as.matrix(x),
                         y = y,
                         kernel = kernlab::polydot(degree = param$degree,
                                                   scale = param$scale,
                                                   offset = param$offset),
                         C = param$C, ...)
  } else {
    out <- kernlab::ksvm(x = as.matrix(x),
                         y = y,
                         kernel = kernlab::polydot(degree = param$degree,
                                                   scale = param$scale,
                                                   offset = param$offset),
                         C = param$C,
                         prob.model = classProbs,
                         ...)
  }
  out
}
svmPolyCustom$fit <- svmFit
# caret `predict` slot: class predictions from a fitted ksvm.
svmPred <- function(modelFit, newdata, submodels = NULL) {
  # Local helper (renamed: the original shadowed the enclosing function's
  # own name). When a probability model is attached, derive the class label
  # from the most probable column so labels stay consistent with the $prob
  # slot; otherwise use plain predict().
  predict_classes <- function(obj, x) {
    has_prob_model <- !is.null(unlist(obj@prob.model))
    if (has_prob_model) {
      probs <- kernlab::predict(obj, x, type = "probabilities")
      kernlab::lev(obj)[apply(probs, 1, which.max)]
    } else {
      kernlab::predict(obj, x)
    }
  }
  out <- try(predict_classes(modelFit, newdata), silent = TRUE)
  if (is.character(kernlab::lev(modelFit))) {
    if (inherits(out, "try-error")) {
      warning("kernlab class prediction calculations failed; returning NAs")
      # One NA per observation (character, matching the class labels).
      out <- rep(NA_character_, nrow(newdata))
    }
  } else {
    if (inherits(out, "try-error")) {
      warning("kernlab prediction calculations failed; returning NAs")
      out <- rep(NA, nrow(newdata))
    }
  }
  # Some kernlab predict() paths return a one-column matrix; flatten it.
  if (is.matrix(out)) out <- out[, 1]
  out
}
svmPolyCustom$predict <- svmPred
# caret `prob` slot: class-probability predictions from a fitted ksvm.
svmProb <- function(modelFit, newdata, submodels = NULL) {
  out <- try(kernlab::predict(modelFit, newdata, type = "probabilities"),
             silent = TRUE)
  if (!inherits(out, "try-error")) {
    ## There are times when the SVM probability model will
    ## produce negative class probabilities, so we clamp them at zero and
    ## renormalize each row so values lie between 0 and 1.
    if (any(out < 0)) {
      out[out < 0] <- 0
      out <- t(apply(out, 1, function(x) x / sum(x)))
    }
    # Reorder columns to caret's expected class-level order.
    out <- out[, kernlab::lev(modelFit), drop = FALSE]
  } else {
    warning("kernlab class probability calculations failed; returning NAs")
    # BUG FIX: the original allocated nrow(newdata) * n_classes rows;
    # caret expects exactly one row per observation.
    out <- matrix(NA, nrow = nrow(newdata),
                  ncol = length(kernlab::lev(modelFit)))
    colnames(out) <- kernlab::lev(modelFit)
  }
  out
}
svmPolyCustom$prob <- svmProb
# caret `sort` slot: order candidate models from least to most complex
# (by degree, then cost, then scale).
svmSort <- function(x) {
  x[with(x, order(degree, C, scale)), ]
}
# Register the sort slot, then the levels slot (outcome factor levels,
# read back from the fitted ksvm).
svmPolyCustom$sort <- svmSort
svmLevels <- function(x) kernlab::lev(x)
svmPolyCustom$levels <- svmLevels
# Fixed tuning grid: vary degree and cost; hold scale = 1 and offset = 0.
svmP.grid2 <- expand.grid(degree = c(2,3,4),
scale = 1,
offset = 0,
C = c(0.1,1,10,100,1000))
# 10-fold CV repeated 5 times, with the pre-generated seeds for reproducibility.
svmP.control2 <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
seeds = svmP.seeds,
classProbs = T)
# Tune the custom polynomial SVM on the training subset (centered/scaled).
svmP.tuning2 <- train(class~.,
data = tuning,
subset = train,
method = svmPolyCustom,
metric = 'Accuracy',
trControl = svmP.control2,
tuneGrid = svmP.grid2,
preProcess = c("center","scale"))
## line search fails -2.049865 1.885835 1.061595e-05 -3.666528e-06 -4.598396e-08 -2.705361e-08 -3.889704e-13
svmP.tuning2
## 63241 samples
## 3 predictor
## 2 classes: 'other', 'tarp'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 28458, 28459, 28459, 28459, 28459, 28459, ...
## Resampling results across tuning parameters:
##
## degree C Accuracy Kappa
## 2 1e-01 0.9934221 0.8838784
## 2 1e+00 0.9937700 0.8910470
## 2 1e+01 0.9936688 0.8889803
## 2 1e+02 0.9937257 0.8898182
## 2 1e+03 0.9920749 0.8339831
## 3 1e-01 0.9940040 0.8956550
## 3 1e+00 0.9947440 0.9116591
## 3 1e+01 0.9951488 0.9186073
## 3 1e+02 0.9950350 0.9162175
## 3 1e+03 0.9950539 0.9167053
## 4 1e-01 0.9921318 0.8567759
## 4 1e+00 0.9932450 0.8796333
## 4 1e+01 0.9933715 0.8819879
## 4 1e+02 0.9933336 0.8808683
## 4 1e+03 0.9824001 0.5572732
##
## Tuning parameter 'scale' was held constant at a value of 1
## Tuning
## parameter 'offset' was held constant at a value of 0
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 3, scale = 1, offset = 0
## and C = 10.
svmP.tuning2$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 10
##
## Polynomial kernel function.
## Hyperparameters : degree = 3 scale = 1 offset = 0
##
## Number of Support Vectors : 405
##
## Objective Function Value : -3464.73
## Training error : 0.004839
# Performance on the held-out test split.
svmP.tuning2.preds <- predict(svmP.tuning2, newdata = testX)
confusionMatrix(svmP.tuning2.preds, testLabels, positive = 'tarp')
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 30565 83
## tarp 44 928
##
## Accuracy : 0.996
## 95% CI : (0.9952, 0.9967)
## No Information Rate : 0.968
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9339
##
## Mcnemar's Test P-Value : 0.0007464
##
## Sensitivity : 0.91790
## Specificity : 0.99856
## Pos Pred Value : 0.95473
## Neg Pred Value : 0.99729
## Prevalence : 0.03197
## Detection Rate : 0.02935
## Detection Prevalence : 0.03074
## Balanced Accuracy : 0.95823
##
## 'Positive' Class : tarp
##
# Performance on the full holdout image.
svmP.holdout.preds2 <- predict(svmP.tuning2, newdata = holdoutX)
confusionMatrix(svmP.holdout.preds2, holdoutLabels, positive = 'tarp')
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 1978270 133
## tarp 11427 14347
##
## Accuracy : 0.9942
## 95% CI : (0.9941, 0.9943)
## No Information Rate : 0.9928
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7101
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.990815
## Specificity : 0.994257
## Pos Pred Value : 0.556646
## Neg Pred Value : 0.999933
## Prevalence : 0.007225
## Detection Rate : 0.007159
## Detection Prevalence : 0.012860
## Balanced Accuracy : 0.992536
##
## 'Positive' Class : tarp
##
# Predict Probabilities for ROC
svmP.holdout.probs2 <- predict(svmP.tuning2, newdata = holdoutX, type = "prob")
svmP.rates2 <- prediction(svmP.holdout.probs2[2], holdoutLabels)
svmP.ROC2 <- performance(svmP.rates2, measure = "tpr", x.measure = "fpr")
svmP.auc2 <- performance(svmP.rates2, measure = "auc")
svmP.auc2@y.values #auc = 0.9991007
## [[1]]
## [1] 0.9991007
# colorize ROC. The data.frame(c(...)) call relies on c() splicing the three
# one-element lists from the ROCR S4 slots; the names become column names.
svmP.roc.df2 <- data.frame(c("FPR" = svmP.ROC2@x.values, "TPR" = svmP.ROC2@y.values, "Threshold" = svmP.ROC2@alpha.values))
svmP.roc.df2$Threshold[1] <- 1 # replace infinity
svmP.roc.df2 %>% ggplot(aes(FPR, TPR))+
geom_line(aes(color=Threshold), lwd = 2)+
scale_colour_viridis(option = "plasma")+
geom_abline(lwd = 1, color = "gray")+
theme_bw()+
theme(axis.line = element_line(color='gray'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
ggtitle("ROC of SVM [Polynomial: degree = 3, cost = 10]")+
annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9991")
# predict over boundary.grid.
boundary.probs <- predict(svmP.tuning2, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])
new.holdout <- holdout
# hit/miss flag for the polynomial-SVM holdout predictions
new.holdout$hitmiss <- factor(ifelse(svmP.holdout.preds2==holdout$class,'hit', 'miss'))
# Decision-boundary slice at green == 117 (red vs blue plane).
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
breaks = seq(0,1.0001,length.out=14),
aes(x=red, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
size = 2, stroke = 1.3,
aes(x=red, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("SVM [Polynomial d = 3] Decision Boundary [Blue against Red]")
# Decision-boundary slice at red == 140 (green vs blue plane).
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
breaks = seq(0,1.0001,length.out=14),
aes(x=green, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
size = 2, stroke = 1.3,
aes(x=green, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("SVM [Polynomial d = 3] Decision Boundary [Blue against Green]")
# Seeds for the radial SVM: 25 per resample (one per sigma x C grid cell),
# plus one for the final model fit.
set.seed(199)
svmR.seeds <- vector(mode = "list", length = 51)
for(i in 1:(length(svmR.seeds)-1)) svmR.seeds[[i]]<- sample.int(n=1000, 25)
svmR.seeds[[51]]<-sample.int(1000, 1)
# RBF tuning grid: kernel width (sigma) x cost.
svmR.grid <- expand.grid(sigma = c(0.5,1,2,5,10),
C = c(0.1,1,10,100,1000))
svmR.control1 <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
seeds = svmR.seeds,
classProbs = T)
# Tune caret's built-in radial-kernel SVM on the training subset.
svmR.tuning1 <- train(class~.,
data = tuning,
subset = train,
method = 'svmRadial',
metric = 'Accuracy',
trControl = svmR.control1,
tuneGrid = svmR.grid,
preProcess = c("center","scale"))
svmR.tuning1
## Support Vector Machines with Radial Basis Function Kernel
##
## 63241 samples
## 3 predictor
## 2 classes: 'other', 'tarp'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 28459, 28459, 28459, 28459, 28459, 28459, ...
## Resampling results across tuning parameters:
##
## sigma C Accuracy Kappa
## 0.5 1e-01 0.9953132 0.9241391
## 0.5 1e+00 0.9957560 0.9311194
## 0.5 1e+01 0.9960533 0.9356457
## 0.5 1e+02 0.9965972 0.9444995
## 0.5 1e+03 0.9967553 0.9470093
## 1.0 1e-01 0.9953702 0.9249098
## 1.0 1e+00 0.9959837 0.9346783
## 1.0 1e+01 0.9963505 0.9405322
## 1.0 1e+02 0.9967364 0.9467084
## 1.0 1e+03 0.9966731 0.9457134
## 2.0 1e-01 0.9955156 0.9269026
## 2.0 1e+00 0.9959457 0.9336071
## 2.0 1e+01 0.9965972 0.9445584
## 2.0 1e+02 0.9967111 0.9464128
## 2.0 1e+03 0.9967933 0.9476478
## 5.0 1e-01 0.9959900 0.9344407
## 5.0 1e+00 0.9962683 0.9385540
## 5.0 1e+01 0.9967237 0.9462793
## 5.0 1e+02 0.9968439 0.9486944
## 5.0 1e+03 0.9969388 0.9504169
## 10.0 1e-01 0.9961734 0.9372800
## 10.0 1e+00 0.9965846 0.9439289
## 10.0 1e+01 0.9967617 0.9467812
## 10.0 1e+02 0.9968502 0.9485227
## 10.0 1e+03 0.9965023 0.9429719
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 5 and C = 1000.
svmR.tuning1$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1000
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 5
##
## Number of Support Vectors : 239
##
## Objective Function Value : -168491.2
## Training error : 0.002245
# Performance on the held-out test split.
svmR.tuning1.preds <- predict(svmR.tuning1, newdata = testX)
confusionMatrix(svmR.tuning1.preds, testLabels, positive = 'tarp')
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 30558 28
## tarp 51 983
##
## Accuracy : 0.9975
## 95% CI : (0.9969, 0.998)
## No Information Rate : 0.968
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9601
##
## Mcnemar's Test P-Value : 0.01332
##
## Sensitivity : 0.97230
## Specificity : 0.99833
## Pos Pred Value : 0.95068
## Neg Pred Value : 0.99908
## Prevalence : 0.03197
## Detection Rate : 0.03109
## Detection Prevalence : 0.03270
## Balanced Accuracy : 0.98532
##
## 'Positive' Class : tarp
##
# Performance on the full holdout image. Note the sharp sensitivity drop
# versus the test split (0.97 -> 0.75): the large sigma/cost pair appears to
# generalize poorly here.
svmR.holdout.preds <- predict(svmR.tuning1, newdata = holdoutX)
confusionMatrix(svmR.holdout.preds, holdoutLabels, positive = 'tarp')
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 1974633 3595
## tarp 15064 10885
##
## Accuracy : 0.9907
## 95% CI : (0.9906, 0.9908)
## No Information Rate : 0.9928
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.5342
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.751727
## Specificity : 0.992429
## Pos Pred Value : 0.419477
## Neg Pred Value : 0.998183
## Prevalence : 0.007225
## Detection Rate : 0.005431
## Detection Prevalence : 0.012947
## Balanced Accuracy : 0.872078
##
## 'Positive' Class : tarp
##
# Predict Probabilities for ROC
svmR.holdout.probs <- predict(svmR.tuning1, newdata = holdoutX, type = 'prob')
svmR.rates <- prediction(svmR.holdout.probs[2], holdoutLabels)
svmR.ROC <- performance(svmR.rates, measure = "tpr", x.measure = "fpr")
svmR.auc <- performance(svmR.rates, measure = "auc")
svmR.auc@y.values #auc = 0.986921
## [[1]]
## [1] 0.986921
# colorize ROC
svmR.roc.df <- data.frame(c("FPR" = svmR.ROC@x.values, "TPR" = svmR.ROC@y.values, "Threshold" = svmR.ROC@alpha.values))
svmR.roc.df$Threshold[1] <- 1 # replace infinity
svmR.roc.df %>% ggplot(aes(FPR, TPR))+
geom_line(aes(color=Threshold), lwd = 2)+
scale_colour_viridis(option = "plasma")+
geom_abline(lwd = 1, color = "gray")+
theme_bw()+
theme(axis.line = element_line(color='gray'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
ggtitle("ROC of SVM [RBF: gamma = 5, cost = 1000]")+
annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.9869")
# predict over boundary.grid.
boundary.probs <- predict(svmR.tuning1, boundary.grid, type = "prob")
boundary.df <- cbind(boundary.grid, boundary.probs[2])
new.holdout <- holdout
new.holdout$hitmiss <- factor(ifelse(svmR.holdout.preds==holdout$class,'hit', 'miss'))
# Decision-boundary slice at green == 117 (red vs blue plane).
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==117)),
breaks = seq(0,1.0001,length.out=14),
aes(x=red, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$green==117), ],
size = 2, stroke = 1.3,
aes(x=red, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("SVM RBF
Decision Boundary [Blue against Red]")
# Decision-boundary slice at red == 140 (green vs blue plane).
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
breaks = seq(0,1.0001,length.out=14),
aes(x=green, y=blue, z=tarp), bins = 13)+
scale_fill_manual(values = colPal,guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = new.holdout[which(new.holdout$red==140), ],
size = 2, stroke = 1.3,
aes(x=green, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("SVM RBF Decision Boundary [Blue against Green]")
We evaluate the accuracy of Professor Basener’s Mahalanobis Distance Classifier. The only parameter for this method is the p-value threshold used to label the testing points. Ideally, we would tune this with cross-validation. In the interest of simplicity, we just use a test/train split.
# NOTE(review): attaching MASS here masks dplyr::select(); any later
# tidyverse select() calls would need dplyr::select explicitly.
library(MASS)
# create copy of tuning
training.copy <- tuning[train, ]
testing.copy <- tuning[-train, ]
# create df with only tarps
tarpsOnly <- training.copy[which(training.copy$class == "tarp"), ]
# create dfs with only RGB values for the tarp subset and the test split
# (column 1 is the class factor)
tarpsOnly_rgb <- tarpsOnly[, -1]
testing.copy_rgb <- testing.copy[, -1]
# Compute Mahalanobis Distances from each test pixel to the tarp centroid,
# using the tarp-only covariance.
testing.copy$mahalanobis <- mahalanobis(testing.copy_rgb,
colMeans(tarpsOnly_rgb),
cov(tarpsOnly_rgb))
# Under multivariate normality, squared Mahalanobis distance ~ chi-sq(df = 3
# predictors); a large p-value means the pixel is close to the tarp mean.
testing.copy$pvalue <- pchisq(testing.copy$mahalanobis, df=3, lower.tail=FALSE)
# idea: create ROC-like plot...
# idea: line plot accuracy (and TPR and FPR) against Mahalanobis Dist to select cut off
# use p-values to set threshold
threshold.mesh <- seq(0.01,0.99,length.out=99)
# Metric accumulators, filled by index in the sweep below.
acc <- c()
tpr <- c()
fpr <- c()
ppv <- c()
# Sweep thresholds; classify "tarp" when p-value >= threshold.
# NOTE(review): assumes both labels occur at every threshold -- if one label
# disappears, table() is 1x2 and the [2, ] indexing below errors. seq_along()
# would also be safer than 1:length().
for (tr in 1:length(threshold.mesh)){
predicted.labels <- ifelse(testing.copy$pvalue>=threshold.mesh[tr], "tarp", "other")
cf.matrix <- table(predicted.labels, testing.copy$class)
acc[tr] <- (cf.matrix[1,1]+cf.matrix[2,2])/sum(cf.matrix)
tpr[tr] <- cf.matrix[2,2]/(cf.matrix[2,2]+cf.matrix[1,2])
fpr[tr] <- cf.matrix[2,1]/(cf.matrix[2,1]+cf.matrix[1,1])
ppv[tr] <- cf.matrix[2,2]/(cf.matrix[2,2]+cf.matrix[2,1])
}
threshold.df <- cbind(threshold.mesh, acc, tpr, fpr, ppv) %>% data.frame()
colnames(threshold.df) <- c("threshold", "accuracy", "TPR", "FPR", "PPV")
threshold.df$threshold <- as.numeric(threshold.df$threshold)
#ideal threshold based on overall accuracy
(best.thresh <- threshold.df$threshold[which.max(threshold.df$accuracy)])
## [1] 0.27
plot(x=threshold.df$threshold, y=threshold.df$accuracy,
xlab = "p-value Threshold",
ylab = " Accuracy",
main = "Prediction Accuracy against p-value Threshold",
type = "l")
The test/train split chooses a p-value threshold of 0.27. Observations with p-values greater than or equal to 0.27 are classified as tarp, and observations with p-values below 0.27 are classified as other. We now view the confusion matrix for the testing data at this threshold.
# Label test pixels at the accuracy-optimal threshold and evaluate.
Mah.labels <- ifelse(testing.copy$pvalue>=best.thresh, "tarp", "other") %>% factor()
confusionMatrix(Mah.labels, testing.copy$class, positive = "tarp")
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 30506 266
## tarp 103 745
##
## Accuracy : 0.9883
## 95% CI : (0.9871, 0.9895)
## No Information Rate : 0.968
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7955
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.73689
## Specificity : 0.99663
## Pos Pred Value : 0.87854
## Neg Pred Value : 0.99136
## Prevalence : 0.03197
## Detection Rate : 0.02356
## Detection Prevalence : 0.02682
## Balanced Accuracy : 0.86676
##
## 'Positive' Class : tarp
##
We now apply the classifier to the holdout data.
#create copy of holdout data
holdout.copy <- holdout
# RGB columns only (column 4 is the class factor in holdout)
holdout.copy_rgb <- holdout.copy[,-4]
# compute Mah Dist and p-values for each obsv in holdout
holdout.copy$mahalanobis <- mahalanobis(holdout.copy_rgb,
colMeans(tarpsOnly_rgb),
cov(tarpsOnly_rgb))
holdout.copy$pvalue <- pchisq(holdout.copy$mahalanobis, df=3, lower.tail=FALSE)
# ----
# NOTE(review): the threshold is re-tuned on the holdout itself below
# (best.thresh.h = 0.53), which leaks holdout information into the final
# labels; evaluating at best.thresh (0.27) from the test split would be the
# honest out-of-sample assessment.
acc.h <- c()
tpr.h <- c()
fpr.h <- c()
ppv.h <- c()
for (tr in 1:length(threshold.mesh)){
predicted.labels <- ifelse(holdout.copy$pvalue>=threshold.mesh[tr], "tarp", "other")
cf.matrix <- table(predicted.labels, holdout.copy$class)
acc.h[tr] <- (cf.matrix[1,1]+cf.matrix[2,2])/sum(cf.matrix)
tpr.h[tr] <- cf.matrix[2,2]/(cf.matrix[2,2]+cf.matrix[1,2])
fpr.h[tr] <- cf.matrix[2,1]/(cf.matrix[2,1]+cf.matrix[1,1])
ppv.h[tr] <- cf.matrix[2,2]/(cf.matrix[2,2]+cf.matrix[2,1])
}
threshold.df.h <- cbind(threshold.mesh, acc.h, tpr.h, fpr.h, ppv.h) %>% data.frame()
colnames(threshold.df.h) <- c("threshold", "accuracy", "TPR", "FPR", "PPV")
threshold.df.h$threshold <- as.numeric(threshold.df.h$threshold)
#ideal threshold based on overall accuracy
(best.thresh.h <- threshold.df.h$threshold[which.max(threshold.df.h$accuracy)])
## [1] 0.53
plot(x=threshold.df.h$threshold, y=threshold.df.h$accuracy,
xlab = "p-value Threshold",
ylab = " Accuracy",
main = "Prediction Accuracy against p-value Threshold",
type = "l")
# apply labels using the holdout-optimal p-value threshold (best.thresh.h = 0.53)
predicted.labels.holdout <- ifelse(holdout.copy$pvalue >= best.thresh.h, "tarp", "other") %>% factor()
# confusionMatrix
confusionMatrix(predicted.labels.holdout, holdout.copy$class, positive = "tarp")
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 1989116 13082
## tarp 581 1398
##
## Accuracy : 0.9932
## 95% CI : (0.9931, 0.9933)
## No Information Rate : 0.9928
## P-Value [Acc > NIR] : 3.128e-12
##
## Kappa : 0.1684
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.0965470
## Specificity : 0.9997080
## Pos Pred Value : 0.7064174
## Neg Pred Value : 0.9934662
## Prevalence : 0.0072249
## Detection Rate : 0.0006975
## Detection Prevalence : 0.0009874
## Balanced Accuracy : 0.5481275
##
## 'Positive' Class : tarp
##
# Use p-values directly as the ranking score for the ROC curve.
mahalanobis.holdout.probs <- holdout.copy$pvalue
mahalanobis.rates <- prediction(mahalanobis.holdout.probs, holdout.copy$class)
mahalanobis.ROC <- performance(mahalanobis.rates, measure = 'tpr', x.measure = 'fpr')
mahalanobis.auc <- performance(mahalanobis.rates, measure = "auc")
mahalanobis.auc@y.values #auc = 0.762413
## [[1]]
## [1] 0.762413
# colorize ROC
mahalanobis.roc.df <- data.frame(c("FPR" = mahalanobis.ROC@x.values,
"TPR" = mahalanobis.ROC@y.values,
"Threshold" = mahalanobis.ROC@alpha.values))
mahalanobis.roc.df$Threshold[1] <- 1 # replace infinity
mahalanobis.roc.df %>% ggplot(aes(FPR, TPR))+
geom_line(aes(color=Threshold), lwd = 2)+
scale_colour_viridis(option = "plasma")+
geom_abline(lwd = 1, color = "gray")+
theme_bw()+
theme(axis.line = element_line(color='gray'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
ggtitle("ROC of Mahalanobis Distance Classifier")+
annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.7624")
# Mahalanobis distances and chi-square p-values over the RGB boundary grid.
boundary.dist <- mahalanobis(boundary.grid,
colMeans(tarpsOnly_rgb),
cov(tarpsOnly_rgb))
boundary.probs <- pchisq(boundary.dist, df = 3, lower.tail = F)
boundary.df <- cbind(boundary.grid, boundary.probs, boundary.dist)
holdout.copy$labels <- ifelse(holdout.copy$pvalue >= best.thresh.h, "tarp", "other")
holdout.copy$hitmiss <- factor(ifelse(holdout.copy$labels==holdout.copy$class,'hit', 'miss'))
# Decision-boundary slice at green == 186.
# NOTE(review): earlier boundary plots slice at green == 117 -- confirm the
# different slice here is intentional.
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$green==186)),
breaks =seq(0,1,length.out = 14),
aes(x=red, y=blue, z=boundary.probs), bins = 13)+
scale_fill_manual(values = colPal, guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = holdout.copy[which(holdout.copy$green==186), ],
size = 2, stroke = 1.3,
aes(x=red, y=blue, color=class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("Mahalanobis Classifier Decision Boundary [Blue against Red]")
# Decision-boundary slice at red == 140 (green vs blue plane).
ggplot(data = NULL)+
geom_contour_filled(data = slice(boundary.df,which(boundary.df$red==140)),
breaks = seq(0,1,length.out = 14),
aes(x=green, y=blue, z=boundary.probs), bins = 13)+
scale_fill_manual(values = colPal, guide = "none")+
theme(axis.line = element_line(color='gray'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
geom_jitter(data = holdout.copy[which(holdout.copy$red==140), ],
size = 2, stroke = 1.3,
aes(x=green, y=blue, color = class, shape = hitmiss))+
scale_color_manual(values=c("#964B00", "#153250"))+
scale_shape_manual(values=c(20,4))+
ggtitle("Mahalanobis Classifier Decision Boundary [Blue against Green]")
The overall accuracy of the Mahalanobis Classifier method is quite good, and comparable to the other methods tested. However, the sensitivity (TPR) is exceptionally poor. We ask: will creating a PCA projection of the RGB values lead to a significant increase in sensitivity?
# Create Principal Components from training subset; save loadings
# NOTE(review): Mahalanobis distance is invariant under invertible linear
# transformations, so this full-rank PCA rotation reproduces the RGB-space
# classifier exactly -- the identical confusion matrix and AUC printed below
# confirm it. PCA cannot change sensitivity here.
train_pc <- prcomp(training.copy[, 2:4], center = T, scale = T)
loadings <- train_pc$rotation
# reattach labels
train_pc_copy <- train_pc$x %>% data.frame()
train_pc_copy$class <- tuning[train, "class"]
#section out tarps
tarpsOnly_pc <- train_pc_copy[which(train_pc_copy$class == "tarp"), ]
tarpsOnly_pc_rgb <- tarpsOnly_pc[,1:3]
# Project the holdout RGB values (columns 1:3 of holdout.copy) onto the
# training principal components using the training center/scale.
holdout_pc_rgb <- scale(holdout.copy[1:3], train_pc$center, train_pc$scale) %*% loadings
holdout_pc_copy <- holdout_pc_rgb %>% data.frame()
holdout_pc_copy$class <- holdout$class
# calculate mahalanobis dist (in PC space, against the tarp-only PC stats)
holdout_pc_copy$mahalanobis <- mahalanobis(holdout_pc_rgb,
colMeans(tarpsOnly_pc_rgb),
cov(tarpsOnly_pc_rgb))
holdout_pc_copy$pvalue <- pchisq(holdout_pc_copy$mahalanobis, df=3, lower.tail=FALSE)
# ----
# Same threshold sweep as before, now on the PC-space p-values.
acc.h <- c()
tpr.h <- c()
fpr.h <- c()
ppv.h <- c()
for (tr in 1:length(threshold.mesh)){
predicted.labels <- ifelse(holdout_pc_copy$pvalue>=threshold.mesh[tr], "tarp", "other")
cf.matrix <- table(predicted.labels, holdout_pc_copy$class)
acc.h[tr] <- (cf.matrix[1,1]+cf.matrix[2,2])/sum(cf.matrix)
tpr.h[tr] <- cf.matrix[2,2]/(cf.matrix[2,2]+cf.matrix[1,2])
fpr.h[tr] <- cf.matrix[2,1]/(cf.matrix[2,1]+cf.matrix[1,1])
ppv.h[tr] <- cf.matrix[2,2]/(cf.matrix[2,2]+cf.matrix[2,1])
}
threshold.df.h <- cbind(threshold.mesh, acc.h, tpr.h, fpr.h, ppv.h) %>% data.frame()
colnames(threshold.df.h) <- c("threshold", "accuracy", "TPR", "FPR", "PPV")
threshold.df.h$threshold <- as.numeric(threshold.df.h$threshold)
#ideal threshold based on overall accuracy
(best.thresh.h <- threshold.df.h$threshold[which.max(threshold.df.h$accuracy)])
## [1] 0.53
plot(x=threshold.df.h$threshold, y=threshold.df.h$accuracy,
xlab = "p-value Threshold",
ylab = " Accuracy",
main = "Prediction Accuracy against p-value Threshold",
type = "l")
# apply labels using the holdout-optimal p-value threshold (best.thresh.h = 0.53)
predicted.labels.holdout <- ifelse(holdout_pc_copy$pvalue >= best.thresh.h, "tarp", "other") %>% factor()
# confusionMatrix
confusionMatrix(predicted.labels.holdout, holdout_pc_copy$class, positive = "tarp")
## Confusion Matrix and Statistics
##
## Reference
## Prediction other tarp
## other 1989116 13082
## tarp 581 1398
##
## Accuracy : 0.9932
## 95% CI : (0.9931, 0.9933)
## No Information Rate : 0.9928
## P-Value [Acc > NIR] : 3.128e-12
##
## Kappa : 0.1684
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.0965470
## Specificity : 0.9997080
## Pos Pred Value : 0.7064174
## Neg Pred Value : 0.9934662
## Prevalence : 0.0072249
## Detection Rate : 0.0006975
## Detection Prevalence : 0.0009874
## Balanced Accuracy : 0.5481275
##
## 'Positive' Class : tarp
##
# ---
mahalanobis.holdout.probs <- holdout_pc_copy$pvalue
mahalanobis.rates <- prediction(mahalanobis.holdout.probs, holdout_pc_copy$class)
mahalanobis.ROC <- performance(mahalanobis.rates, measure = 'tpr', x.measure = 'fpr')
mahalanobis.auc <- performance(mahalanobis.rates, measure = "auc")
mahalanobis.auc@y.values #auc = 0.762413 (identical to the non-PCA classifier)
## [[1]]
## [1] 0.762413
# colorize ROC
mahalanobis.roc.df <- data.frame(c("FPR" = mahalanobis.ROC@x.values,
"TPR" = mahalanobis.ROC@y.values,
"Threshold" = mahalanobis.ROC@alpha.values))
mahalanobis.roc.df$Threshold[1] <- 1 # replace infinity
mahalanobis.roc.df %>% ggplot(aes(FPR, TPR))+
geom_line(aes(color=Threshold), lwd = 2)+
scale_colour_viridis(option = "plasma")+
geom_abline(lwd = 1, color = "gray")+
theme_bw()+
theme(axis.line = element_line(color='gray'),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())+
ggtitle("ROC of Mahalanobis Distance Classifier")+
annotate("text", x = 0.75, y = 0.25, label = "AUC = 0.7624")
# Compare the tarp covariance in PC space vs raw RGB space.
cov(tarpsOnly_pc_rgb)
## PC1 PC2 PC3
## PC1 1.07466140 0.2606116466 -0.0376292406
## PC2 0.26061165 0.1200329881 0.0009582995
## PC3 -0.03762924 0.0009582995 0.0087330840
cov(tarpsOnly_rgb)
## red green blue
## red 1221.296 1480.892 1484.532
## green 1480.892 1917.161 2004.784
## blue 1484.532 2004.784 2437.707